本案例將利用R來重製華盛頓郵報在2016/08/13的一篇談論美國婦女產假支薪情形的報導。這個案例中將會應用到data.frame和基本的繪圖與資料摘要方法。The case adaped Washington Post’s paid maternity leave as an exmaple to introduce basic skill of data.frame, plotting, and data mamipulation.
原始新聞來源:https://www.washingtonpost.com/news/worldviews/wp/2016/08/13/the-world-is-getting-better-at-paid-maternity-leave-the-u-s-is-not/?tid=sm_tw&utm_term=.f8cd50280326#comments
library(readxl)
options(stringsAsFactors = FALSE)
ldata <- read_excel("data/WORLD-MACHE_Gender_6.8.15.xls", "Sheet1", col_names=T)
# select columns by index
matleave <- ldata[ , c(3, 6:24)]
str(matleave)
## Classes 'tbl_df', 'tbl' and 'data.frame': 197 obs. of 20 variables:
## $ iso3 : chr "AFG" "ALB" "DZA" "AND" ...
## $ matleave_95: num 2 5 3 2 2 2 2 3 1 5 ...
## $ matleave_96: num 2 5 3 2 2 2 2 3 1 5 ...
## $ matleave_97: num 2 5 3 2 2 2 2 3 1 5 ...
## $ matleave_98: num 2 5 3 2 2 2 2 3 1 5 ...
## $ matleave_99: num 2 5 3 2 2 2 2 3 1 5 ...
## $ matleave_00: num 2 5 3 3 2 2 2 3 1 5 ...
## $ matleave_01: num 2 5 3 3 2 2 2 3 1 5 ...
## $ matleave_02: num 2 5 3 3 2 2 2 3 1 5 ...
## $ matleave_03: num 2 5 3 3 2 2 2 3 1 5 ...
## $ matleave_04: num 2 5 3 3 2 2 2 5 1 5 ...
## $ matleave_05: num 2 5 3 3 2 2 2 5 1 5 ...
## $ matleave_06: num 2 5 3 3 2 2 2 5 1 5 ...
## $ matleave_07: num 2 5 3 3 2 2 2 5 1 5 ...
## $ matleave_08: num 2 5 3 3 2 2 2 5 1 5 ...
## $ matleave_09: num 2 5 3 3 2 2 2 5 1 5 ...
## $ matleave_10: num 2 5 3 3 2 2 2 5 NA 5 ...
## $ matleave_11: num 2 5 3 3 2 2 2 5 3 5 ...
## $ matleave_12: num 2 5 3 3 2 2 2 5 3 5 ...
## $ matleave_13: num 2 5 3 3 2 2 2 5 3 5 ...
# select all NA cells and assign 0 to them
matleave[is.na(matleave)] <- 0
# filter rows by condition
m5 <- matleave[matleave$'matleave_13' == 5, ]
# filter rows by condition
m55<- m5[m5$'matleave_95' == 5,]
# plot
par(mfrow=c(4,6), mai= c(0.2, 0.2, 0.2, 0.2))
for (i in c(1:nrow(m55))){
barplot(unlist(m55[i,-1]),
border=NA, space=0,xaxt="n", yaxt="n", ylim = c(0,5))
title(m55[i,1], line = -4, cex.main=3)
}
?read_excel查詢一下可以怎麼用。read_excel() convert a sheet to a data.frame ``` read_excel(path, sheet = NULL, range = NULL, col_names = TRUE, col_types = NULL, na = “”, trim_ws = TRUE, skip = 0, n_max = Inf, guess_max = min(1000, n_max))```
ldata <- read_excel("data/WORLD-MACHE_Gender_6.8.15.xls", "Sheet1", col_names=T)
# View(ldata)
class(ldata) # [1] "tbl_df" "tbl" "data.frame"
## [1] "tbl_df" "tbl" "data.frame"
dim(ldata)
## [1] 197 156
# Show names of variables (vectors, columns) by names()
names(ldata)
## [1] "country" "iso2" "iso3"
## [4] "region" "wb_econ" "matleave_95"
## [7] "matleave_96" "matleave_97" "matleave_98"
## [10] "matleave_99" "matleave_00" "matleave_01"
## [13] "matleave_02" "matleave_03" "matleave_04"
## [16] "matleave_05" "matleave_06" "matleave_07"
## [19] "matleave_08" "matleave_09" "matleave_10"
## [22] "matleave_11" "matleave_12" "matleave_13"
## [25] "matleave_wrr_95" "matleave_wrr_96" "matleave_wrr_97"
## [28] "matleave_wrr_98" "matleave_wrr_99" "matleave_wrr_00"
## [31] "matleave_wrr_01" "matleave_wrr_02" "matleave_wrr_03"
## [34] "matleave_wrr_04" "matleave_wrr_05" "matleave_wrr_06"
## [37] "matleave_wrr_07" "matleave_wrr_08" "matleave_wrr_09"
## [40] "matleave_wrr_10" "matleave_wrr_11" "matleave_wrr_12"
## [43] "matleave_wrr_13" "bf_dur_95" "bf_dur_96"
## [46] "bf_dur_97" "bf_dur_98" "bf_dur_99"
## [49] "bf_dur_00" "bf_dur_01" "bf_dur_02"
## [52] "bf_dur_03" "bf_dur_04" "bf_dur_05"
## [55] "bf_dur_06" "bf_dur_07" "bf_dur_08"
## [58] "bf_dur_09" "bf_dur_10" "bf_dur_11"
## [61] "bf_dur_12" "bf_dur_13" "mat_bfeed_6mon_95"
## [64] "mat_bfeed_6mon_96" "mat_bfeed_6mon_97" "mat_bfeed_6mon_98"
## [67] "mat_bfeed_6mon_99" "mat_bfeed_6mon_00" "mat_bfeed_6mon_01"
## [70] "mat_bfeed_6mon_02" "mat_bfeed_6mon_03" "mat_bfeed_6mon_04"
## [73] "mat_bfeed_6mon_05" "mat_bfeed_6mon_06" "mat_bfeed_6mon_07"
## [76] "mat_bfeed_6mon_08" "mat_bfeed_6mon_09" "mat_bfeed_6mon_10"
## [79] "mat_bfeed_6mon_11" "mat_bfeed_6mon_12" "mat_bfeed_6mon_13"
## [82] "minage_fem_leg_95" "minage_fem_leg_96" "minage_fem_leg_97"
## [85] "minage_fem_leg_98" "minage_fem_leg_99" "minage_fem_leg_00"
## [88] "minage_fem_leg_01" "minage_fem_leg_02" "minage_fem_leg_03"
## [91] "minage_fem_leg_04" "minage_fem_leg_05" "minage_fem_leg_06"
## [94] "minage_fem_leg_07" "minage_fem_leg_08" "minage_fem_leg_09"
## [97] "minage_fem_leg_10" "minage_fem_leg_11" "minage_fem_leg_12"
## [100] "legal_diff_leg_95" "legal_diff_leg_96" "legal_diff_leg_97"
## [103] "legal_diff_leg_98" "legal_diff_leg_99" "legal_diff_leg_00"
## [106] "legal_diff_leg_01" "legal_diff_leg_02" "legal_diff_leg_03"
## [109] "legal_diff_leg_04" "legal_diff_leg_05" "legal_diff_leg_06"
## [112] "legal_diff_leg_07" "legal_diff_leg_08" "legal_diff_leg_09"
## [115] "legal_diff_leg_10" "legal_diff_leg_11" "legal_diff_leg_12"
## [118] "minage_fem_pc_95" "minage_fem_pc_96" "minage_fem_pc_97"
## [121] "minage_fem_pc_98" "minage_fem_pc_99" "minage_fem_pc_00"
## [124] "minage_fem_pc_01" "minage_fem_pc_02" "minage_fem_pc_03"
## [127] "minage_fem_pc_04" "minage_fem_pc_05" "minage_fem_pc_06"
## [130] "minage_fem_pc_07" "minage_fem_pc_08" "minage_fem_pc_09"
## [133] "minage_fem_pc_10" "minage_fem_pc_11" "minage_fem_pc_12"
## [136] "legal_diff_pc_95" "legal_diff_pc_96" "legal_diff_pc_97"
## [139] "legal_diff_pc_98" "legal_diff_pc_99" "legal_diff_pc_00"
## [142] "legal_diff_pc_01" "legal_diff_pc_02" "legal_diff_pc_03"
## [145] "legal_diff_pc_04" "legal_diff_pc_05" "legal_diff_pc_06"
## [148] "legal_diff_pc_07" "legal_diff_pc_08" "legal_diff_pc_09"
## [151] "legal_diff_pc_10" "legal_diff_pc_11" "legal_diff_pc_12"
## [154] "minwage_ppp_2013" "mw_overtime" "oecd"
matleave <- ldata[ , c(3, 6:24)]
class(matleave)
## [1] "tbl_df" "tbl" "data.frame"
dim(matleave)
## [1] 197 20
str(matleave)
## Classes 'tbl_df', 'tbl' and 'data.frame': 197 obs. of 20 variables:
## $ iso3 : chr "AFG" "ALB" "DZA" "AND" ...
## $ matleave_95: num 2 5 3 2 2 2 2 3 1 5 ...
## $ matleave_96: num 2 5 3 2 2 2 2 3 1 5 ...
## $ matleave_97: num 2 5 3 2 2 2 2 3 1 5 ...
## $ matleave_98: num 2 5 3 2 2 2 2 3 1 5 ...
## $ matleave_99: num 2 5 3 2 2 2 2 3 1 5 ...
## $ matleave_00: num 2 5 3 3 2 2 2 3 1 5 ...
## $ matleave_01: num 2 5 3 3 2 2 2 3 1 5 ...
## $ matleave_02: num 2 5 3 3 2 2 2 3 1 5 ...
## $ matleave_03: num 2 5 3 3 2 2 2 3 1 5 ...
## $ matleave_04: num 2 5 3 3 2 2 2 5 1 5 ...
## $ matleave_05: num 2 5 3 3 2 2 2 5 1 5 ...
## $ matleave_06: num 2 5 3 3 2 2 2 5 1 5 ...
## $ matleave_07: num 2 5 3 3 2 2 2 5 1 5 ...
## $ matleave_08: num 2 5 3 3 2 2 2 5 1 5 ...
## $ matleave_09: num 2 5 3 3 2 2 2 5 1 5 ...
## $ matleave_10: num 2 5 3 3 2 2 2 5 NA 5 ...
## $ matleave_11: num 2 5 3 3 2 2 2 5 3 5 ...
## $ matleave_12: num 2 5 3 3 2 2 2 5 3 5 ...
## $ matleave_13: num 2 5 3 3 2 2 2 5 3 5 ...
NA: Not Availablev[is.na(v)] will select all NA cellssum(is.na(matleave))的目的是檢測還有沒有NA值。如果有的話is.na()就會是TRUE,那麼加總後,如果不是0,那就代表還有NA。matleave[is.na(matleave)] <- 0
# checks if there are still NA cells.
anyNA(matleave)
## [1] FALSE
sum(is.na(matleave))
## [1] 0
unlist()?請試著執行barplot(matleave[2, -1])這個沒有unlist()的版本,看看會有什麼錯誤訊息。資料結構有何差異呢?class()或str()嘗試觀察沒有unlist()版本的資料,看看資料型態和有unlist()的會有何不同?barplot(unlist(matleave[2, -1]))
# Take a look at the data type of matleave[2, ]
class(matleave[2, -1])
## [1] "tbl_df" "tbl" "data.frame"
# Using unlist() to convert a single row dataframe to a vector
unlist(matleave[2, -1])
## matleave_95 matleave_96 matleave_97 matleave_98 matleave_99 matleave_00
## 5 5 5 5 5 5
## matleave_01 matleave_02 matleave_03 matleave_04 matleave_05 matleave_06
## 5 5 5 5 5 5
## matleave_07 matleave_08 matleave_09 matleave_10 matleave_11 matleave_12
## 5 5 5 5 5 5
## matleave_13
## 5
class(unlist(matleave[2, -1]))
## [1] "numeric"
# Add more arguments
barplot(unlist(matleave[2, -1]))
barplot(unlist(matleave[2, -1]), ylim=c(0, 5))
barplot(unlist(matleave[2, -1]), ylim=c(0, 5), space=0)
barplot(unlist(matleave[2, -1]), ylim=c(0, 5), space=0, border=NA)
barplot(unlist(matleave[2, -1]), ylim=c(0, 5), space=0, border=NA, xaxt="n", yaxt="n")
# View(matleave[1]) # select the 1st variable
# View(matleave[ ,1]) # select the 1st column
# View(matleave[1, ]) # select the 1st row
class(matleave[1]) # "tbl_df" "tbl" "data.frame"
## [1] "tbl_df" "tbl" "data.frame"
class(matleave[ ,1]) # "tbl_df" "tbl" "data.frame"
## [1] "tbl_df" "tbl" "data.frame"
class(matleave[1, ]) # "tbl_df" "tbl" "data.frame"
## [1] "tbl_df" "tbl" "data.frame"
class(matleave$iso3) # character (vector)
## [1] "character"
matleave內的索引由1被列出至6。因此,最好的方法是用迴圈(for-loop)的方式將相同的程式碼,從1~6之間做六次。barplot(unlist(matleave[1, -1]), ylim=c(0, 5), space=0, border=NA, xaxt="n", yaxt="n")
barplot(unlist(matleave[2, -1]), ylim=c(0, 5), space=0, border=NA, xaxt="n", yaxt="n")
barplot(unlist(matleave[3, -1]), ylim=c(0, 5), space=0, border=NA, xaxt="n", yaxt="n")
barplot(unlist(matleave[4, -1]), ylim=c(0, 5), space=0, border=NA, xaxt="n", yaxt="n")
barplot(unlist(matleave[5, -1]), ylim=c(0, 5), space=0, border=NA, xaxt="n", yaxt="n")
barplot(unlist(matleave[6, -1]), ylim=c(0, 5), space=0, border=NA, xaxt="n", yaxt="n")
for(i in 1:6){
barplot(unlist(matleave[i, -1]), ylim=c(0, 5), space=0, border=NA, xaxt="n", yaxt="n")
}
Check ?par to get paremeters of plotting
**mai**: A numerical vector of the form c(bottom, left, top, right) which gives the margin size specified in inches.
**mfcol, mfrow**:A vector of the form c(nr, nc). Subsequent figures will be drawn in an nr-by-nc array on the device by columns (mfcol), or rows (mfrow), respectively.
par(mfrow=c(3,2), mai= c(0.2, 0.2, 0.2, 0.2))
for(i in 1:6){
barplot(unlist(matleave[i, -1]), ylim=c(0, 5), space=0, border=NA, xaxt="n", yaxt="n")
}
# plot more rows to see what happens
par(mfrow=c(3,2), mai= c(0.2, 0.2, 0.2, 0.2))
for(i in 1:10){
barplot(unlist(matleave[i, -1]), ylim=c(0, 5), space=0, border=NA, xaxt="n", yaxt="n")
}
# plot all subplots in a figure
m5 <- matleave[matleave$'matleave_13'==5, ]
nrow(m5)
## [1] 34
# matleave$'matleave_13'
# matleave$'matleave_13'==5
# length(matleave$'matleave_13'==5)
# filter rows whose 'matleave_95' is 5
m55<- m5[m5$'matleave_95'==5,]
# filter rows whose 'matleave_95' is not 5
m05<- m5[m5$'matleave_95'!=5,]
nrow(m55)
## [1] 18
par(mfrow=c(4, 6), mai= c(0.2, 0.2, 0.2, 0.2))
for (i in 1:nrow(m55)){
barplot(unlist(m55[i, -1]), border=NA, space=0, xaxt="n", yaxt="n", ylim = c(0,5))
}
# How do you add title for each subplot?
par(mfrow=c(4,6), mai= c(0.2, 0.2, 0.2, 0.2))
for (i in 1:nrow(m55)){
barplot(unlist(m55[i, -1]), border=NA, space=0,xaxt="n", yaxt="n", ylim = c(0,5))
title(m55[i,1], line = -4, cex.main=3)
}
# plotting matleave_95 != 5 but matleve_13 == 5
# plotting for matleave_13 == 4
# Select, and filter data in dplyr form
library(dplyr)
m55 <- ldata %>%
select(iso3, contains("matleave"), -contains("wrr")) %>%
mutate_if(is.numeric, function(x){ifelse(is.na(x), 0, x)}) %>%
filter(matleave_13==5, matleave_95==5)
library(tidyr)
# Gather 2:20 column to a new variable "year"
# Name level data as "degree"
long_form <- gather(matleave, "year", "degree", 2:20)
# install.packages("rworldmap")
library(rworldmap)# drawing worldmap
# select cols
mdata <- ldata[,c(3, 6:24)]
# join your data with the world map data
myMap <- joinCountryData2Map(mdata, joinCode = "ISO3", nameJoinColumn = "iso3")
## 196 codes from your data successfully matched countries in the map
## 1 codes from your data failed to match with a country code in the map
## 47 codes from the map weren't represented in your data
# 196 codes from your data successfully matched countries in the map
# 1 codes from your data failed to match with a country code in the map
# 47 codes from the map weren't represented in your data
myMap$matleave_13
## [1] 2 2 5 2 2 5 NA NA 3 5 5 2 4 3 3 3 5 2 5 5 3 2 3
## [24] 3 2 2 3 4 3 4 3 3 3 3 3 3 3 5 NA 3 5 5 3 5 2 3
## [47] 2 2 2 3 5 2 5 2 NA 4 3 4 3 2 3 4 2 2 4 NA 2 2 2
## [70] 5 2 5 2 2 4 4 2 4 3 4 2 2 5 3 2 3 2 5 NA 2 2 2
## [93] 2 3 2 2 5 4 5 3 5 3 2 4 3 2 5 5 2 3 2 2 2 NA 3
## [116] 2 2 3 4 2 3 2 2 3 2 2 1 5 NA 2 4 2 2 5 5 2 NA 2
## [139] 2 2 3 2 2 2 3 5 1 5 5 5 2 3 3 3 2 5 3 2 3 2 3
## [162] NA 2 2 5 2 1 5 4 4 2 NA 2 3 3 3 NA NA NA 3 NA NA 2 2
## [185] NA NA 2 2 3 2 NA NA 2 NA 1 NA NA 2 NA NA NA NA NA NA NA NA 2
## [208] 2 2 3 NA NA 3 2 1 3 NA NA 2 NA 1 1 NA 1 NA 3 NA NA 5 NA
## [231] 2 NA 3 NA 1 5 2 NA NA NA 2 2 NA
# Draw world maps
dev.off()
## RStudioGD
## 2
mapCountryData(myMap
, nameColumnToPlot="matleave_13"
, catMethod = "categorical"
)
# self-defined colors
colors <- c("#FF8000", "#A9D0F5", "#58ACFA", "#0080FF", "#084B8A")
mapCountryData(myMap
, nameColumnToPlot="matleave_13"
, catMethod = "categorical"
, colourPalette = colors
, addLegend="FALSE"
)
par() for plotting as subplotspar(mfrow=c(4,5), mai= c(0.2, 0.2, 0.2, 0.2))
for(i in 51:69){
mapCountryData(myMap
, nameColumnToPlot=names(myMap)[i]
, catMethod = "categorical"
, colourPalette = colors
, addLegend="FALSE"
)
}
# select 1 to 24 vectors
tdata <- ldata[ ,1:24]
names(tdata)
## [1] "country" "iso2" "iso3" "region" "wb_econ"
## [6] "matleave_95" "matleave_96" "matleave_97" "matleave_98" "matleave_99"
## [11] "matleave_00" "matleave_01" "matleave_02" "matleave_03" "matleave_04"
## [16] "matleave_05" "matleave_06" "matleave_07" "matleave_08" "matleave_09"
## [21] "matleave_10" "matleave_11" "matleave_12" "matleave_13"
# deal with NAs
tdata[is.na(tdata)] <- 0
# create contigency table by region for matleave_13: data count by length()
num <- tapply(ldata$matleave_13, ldata$region, length)
# tapply() using mean(), sum(), and sd
total <- tapply(ldata$matleave_13, ldata$region, mean)
average <- tapply(ldata$matleave_13, ldata$region, sum)
sd <- tapply(ldata$matleave_13, ldata$region, sd)
# create data frame for about variables
res <- data.frame(num, average, total, sd)
# View(res)
byregion <- aggregate(tdata[,6:24], by=list(tdata$region), mean)
# View(byregion)
byregion.sd <- aggregate(tdata[,6:24], by=list(tdata$region), sd)
?aggregate
head(byregion)
## Group.1 matleave_95 matleave_96 matleave_97
## 1 Americas 2.171429 2.171429 2.171429
## 2 East Asia & Pacific 1.750000 1.750000 1.750000
## 3 Europe & Central Asia 3.272727 3.272727 3.400000
## 4 Middle East & North Africa 2.000000 2.000000 2.000000
## 5 South Asia 1.625000 1.625000 1.625000
## 6 Sub-Saharan Africa 2.208333 2.208333 2.229167
## matleave_98 matleave_99 matleave_00 matleave_01 matleave_02 matleave_03
## 1 2.171429 2.142857 2.200000 2.20000 2.200000 2.257143
## 2 1.750000 1.750000 1.750000 1.84375 1.937500 2.000000
## 3 3.381818 3.436364 3.581818 3.60000 3.654545 3.872727
## 4 2.000000 2.000000 2.000000 2.00000 2.157895 2.157895
## 5 1.625000 1.625000 1.625000 1.62500 1.625000 1.625000
## 6 2.229167 2.187500 2.270833 2.31250 2.333333 2.333333
## matleave_04 matleave_05 matleave_06 matleave_07 matleave_08 matleave_09
## 1 2.257143 2.257143 2.257143 2.257143 2.257143 2.257143
## 2 2.000000 2.031250 2.031250 2.031250 2.031250 2.062500
## 3 3.963636 4.018182 4.109091 4.127273 4.200000 4.218182
## 4 2.263158 2.263158 2.263158 2.263158 2.315789 2.368421
## 5 1.625000 1.625000 1.750000 1.625000 1.875000 2.125000
## 6 2.333333 2.354167 2.395833 2.395833 2.395833 2.395833
## matleave_10 matleave_11 matleave_12 matleave_13
## 1 2.257143 2.314286 2.371429 2.371429
## 2 2.031250 2.156250 2.250000 2.281250
## 3 4.254545 4.254545 4.272727 4.309091
## 4 2.421053 2.421053 2.421053 2.421053
## 5 2.125000 2.125000 2.125000 2.125000
## 6 2.458333 2.500000 2.500000 2.500000
# Line ploting the 1st row
dev.off()
## RStudioGD
## 2
plot(unlist(byregion[1,]), type="o")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 強制變更過程中產生了 NA
# Line ploting the 2nd~6th rows
plot(unlist(byregion[2,]), type="o")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 強制變更過程中產生了 NA
plot(unlist(byregion[3,]), type="o")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 強制變更過程中產生了 NA
plot(unlist(byregion[4,]), type="o")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 強制變更過程中產生了 NA
plot(unlist(byregion[5,]), type="o")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 強制變更過程中產生了 NA
plot(unlist(byregion[6,]), type="o")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 強制變更過程中產生了 NA
# Using for-loop and par() to plot all graph
par(mfrow=c(3,2), mai= rep(0.3, 4))
for (i in 1:6){
plot(unlist(byregion[i,-1]), type="o", ylim=c(0, 5))
title(byregion[i,1])
}
# staySame version
# staySame <- apply(m5[,2:20], 1, function(x) length(unique(x[!is.na(x)])))
# m55 <- m5[staySame, ]
# m50 <- m5[!staySame, ]
library(dplyr)
library(tidyr)
library(ggplot2)
long_form <- gather(matleave, "year", "degree", 2:20)
matleave <- ldata %>%
select(iso3, contains("matleave"), -contains("wrr")) %>%
filter(matleave_13==5, matleave_95==5) %>%
gather("year", "degree", 2:20) %>%
replace_na(list(degree=0)) %>%
mutate(year2=as.POSIXct(strptime(year, "matleave_%y"))) %>%
mutate(year3 = strftime(year2, "%Y"))
matleave %>%
ggplot() +
aes(year3, degree) +
facet_grid(iso3~.) +
geom_bar(stat = "identity", fill = "blue")